# Purpose: Run hedonic regressions to estimate whether home prices are responsive to building code requirements
pacman::p_load(tidyverse, fixest, fst, DescTools, data.table)

# Setup -----
rm(list = ls())

source("code/globals.R")

# Load statewide sample -----

assessment <- read_fst(file.path(WORKING, "statewide-sample.fst"),
  as.data.table = T,
  to = NULL
)

# Load sales data for all CA homes ----
trans <- read_fst(file.path(WORKING, "trans_clean.fst"),
  columns = c("TransId", "FIPS", "state", "RecordingDate", "DocumentDate", "SalesPriceAmount", "SalesPriceAmountStndCode", "salesprice_quality", "PropertySequenceNumber", "ImportParcelID", "AssessmentLandUseStndCode", "bad_assessment_use", "bad_dataclass", "bad_doctype", "bad_loantype", "bad_partialinterest", "bad_pricecode", "bad_priceamt", "bad_ztrax_family"),
  as.data.table = T,
  to = NULL
)

trans[, bad_obs := bad_assessment_use | bad_dataclass | bad_doctype | bad_loantype | bad_partialinterest | bad_priceamt | bad_ztrax_family]

# Drop any bad observations (this drops the majority, since the majority of transactions do not fit our requirements)
trans <- trans[bad_obs == F]

# Drop all flags
trans <- trans[, `:=`(
  bad_obs = NULL,
  bad_assessment_use = NULL,
  bad_dataclass = NULL,
  bad_doctype = NULL,
  bad_loantype = NULL,
  bad_partialinterest = NULL,
  bad_priceamt = NULL,
  bad_pricecode = NULL,
  bad_ztrax_family = NULL
)]

# Also drop any transactions that don't have sales price since we need price for hedonic regs
trans <- trans[complete.cases(SalesPriceAmount)]

# Ref: https://www.zillow.com/research/ztrax/ztrax-faqs/
trans[, sale_date := DocumentDate]
trans[is.na(sale_date), sale_date := RecordingDate]
trans[, sale_year := year(sale_date)]

# Create a sale year period variable analogous to the one we use for vintage in the threatened homes sample
trans[sale_year < 1998, sale_period := 0]
trans[sale_year >= 1998 & sale_year < 2008, sale_period := 1]
trans[sale_year >= 2008, sale_period := 2]
trans[, sale_period := factor(sale_period, levels = c(0, 1, 2))]

# Count # sales per parcel
trans[, n_sales := .N, by = ImportParcelID]

# Clean up sale prices ----

# Load CA HPI, average across year
ca_hpi <- read_csv(file.path(INPUT_PUBLIC, "CA-HPI.csv")) %>%
  group_by(sale_year = year(DATE)) %>%
  summarize(hpi = mean(CASTHPI, na.rm = T))
ca_hpi_2015 <- ca_hpi %>%
  filter(sale_year == 2015) %>%
  pull(hpi)

# Adjust sales price for inflation
trans <- merge(trans, ca_hpi, by = "sale_year", all.x = T)
trans[, sales_price_real := SalesPriceAmount * (ca_hpi_2015 / hpi)]

# Winsorize sale prices at 1% and 99% quantiles
trans[, sales_price_w := Winsorize(sales_price_real, val = quantile(sales_price_real, c(0.01, 0.99), na.rm = T))]

# Get log(price)
trans[, logprice := log(sales_price_real)]
trans[, logpricew := log(sales_price_w)]

# Combine assessment and transactions for all parcels ----

sales <- merge(trans, assessment, by = c("FIPS", "ImportParcelID"))

# Save
write_fst(sales, file.path(WORKING, "allCA-sales-clean.fst"))
